knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(matrixStats)
##
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
##
## count
library(ggplot2)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
##
## dcast, melt
library(ggrepel)
##########################################################################################
## LSHTM: Maria’s data that she used for the hvCpG paper can be accessed from ing-p5 here:
# /mnt/old_user_accounts/p3/maria/PhD/Data/datasets/
#
# Her hvCpG scripts are here:
# /mnt/old_user_accounts/p3/maria/PhD/Projects/hvCpGs/Scripts/
source("/home/alice/2024_hvCpG/03_prepDatasetsMaria/dataprep_MariaArrays.R")
my_list_mat <- Maria_filtered_list_mat; rm(Maria_filtered_list_mat)
cpgnames <- unique(unlist(sapply(my_list_mat, row.names)))
cpgnames <- cpgnames[order(cpgnames)]
Maria’s paper: We defined an hvCpG in the following way:
## Within each dataset, calculate the CpGs variance, and keep the top 5%
top5pcvar <- lapply(my_list_mat, function(mat) {
# Step 1: Calculate row variances
row_variances = rowVars(mat, na.rm = T)
df = data.frame(var=row_variances, cpg=names(row_variances))
# Step 2
top = top_n(df, as.integer(0.05*nrow(df)), var) ## slightly different than quantile cause keep ties
return(top)
})
## CpGs in the top 5% variance:
cpg_counts_top <- table(unlist(lapply(top5pcvar, rownames))) %>%
data.frame() %>% dplyr::rename("cpgs"="Var1", "all_cpgs_top5pc"="Freq")
## all covered CpGs:
cpg_counts <- table(unlist(lapply(my_list_mat, rownames))) %>%
data.frame() %>% dplyr::rename("cpgs"="Var1", "all_cpgs"="Freq")
## Both:
cpg_counts_full <- merge(cpg_counts, cpg_counts_top, all.x = T)
rm(cpg_counts, cpg_counts_top)
## in 65% of datasets in which the CpG is covered (following quality control), it has methylation variance in the top 5% of all (non-removed) CpGs:
hvCpGs_maria <- na.omit(cpg_counts_full[cpg_counts_full$all_cpgs_top5pc/cpg_counts_full$all_cpgs >= 0.65,])
## rounding, to mimick Maria's approach
hvCpGs_maria <- na.omit(cpg_counts_full[round(cpg_counts_full$all_cpgs_top5pc/cpg_counts_full$all_cpgs, 2) >= 0.65,])
Maria detected 4143 hvCpGs. I detected 4377 hvCpGs. We both have 4143 hvCpGs in common.
The proportion of CpGs than Maria found hvCpGs is p(hvCpG) = 1.02%.
Within each dataset k, calculate the median sd of all CpG j
all_sd_jk <- sapply(my_list_mat, function(k){
## Scale
k = log2(k/(1-k))
get_sd_k <- function(k){
## Calculate a vector of the row (=per CpG j) sd
return(rowSds(k, na.rm = T))
}
## Return a vector of sds, in a list for each dataset
return(get_sd_k(k))
})
## Plot:
df_long <- reshape2::melt(all_sd_jk, variable.name = "Vector", value.name = "SDs")
## Plot distributions of all vectors on the same graph
ggplot(df_long, aes(x = SDs, color = L1)) +
geom_density() +
labs(title = "Distribution of SDs accross datasets",
x = "SDs",
y = "Density") +
theme_minimal() +
theme(legend.position = "none") +
scale_x_sqrt()
## Warning: Removed 16 rows containing non-finite outside the scale range
## (`stat_density()`).
## Calculate lambda per dataset
lambdas = sapply(all_sd_jk, function(x){quantile(x, 0.95, na.rm=T)/median(x, na.rm=T)})
names(lambdas) <- gsub(".95%", "", names(lambdas))
# Histogram with kernel density
ggplot(data.frame(lambda=lambdas, dataset=names(lambdas)),
aes(x = lambdas)) +
geom_histogram(aes(y = after_stat(density)),
colour = 1, fill = "white", binwidth = .1) +
geom_density(lwd = 1, colour = 4,
fill = 4, alpha = 0.25)+
theme_minimal() +
geom_vline(xintercept = median(lambdas), col = "red")
median(lambdas) #1.878303
## [1] 1.878303
Which datasets have a higher lambda, and why?
df = data.frame(nind=sapply(my_list_mat, ncol),
dataset=names(my_list_mat),
lambda=lambdas,
tissue = sapply(strsplit(names(my_list_mat), "_"), `[`, 1),
ethnicity=sapply(strsplit(names(my_list_mat), "_"), `[`, 2)) %>%
mutate(dataset=ifelse(dataset %in% c("Blood_Cauc", "Blood_Hisp"), "Blood_Cauc_Hisp", dataset)) %>%
mutate(dataset=ifelse(dataset %in% c("Blood_Mexican", "Blood_PuertoRican "), "Blood_Mex_PuertoRican ", dataset)) %>%
mutate(dataset=ifelse(dataset %in% c("CD4+_Estonian", "CD8+_Estonian"), "CD4+_CD8+_Estonian", dataset)) %>%
mutate(dataset=ifelse(dataset %in% c("Saliva_Hisp", "Saliva_Cauc"), "Saliva_Hisp_Cauc", dataset))
ggplot(data = df,
aes(x=lambda, y=nind))+
geom_smooth(method = "lm")+
geom_point()+
geom_label_repel(aes(label = dataset, fill=dataset), size= 2, alpha=.8, max.overlaps = 25)+
theme_bw()+theme(legend.position = "none")+
scale_x_log10()
## `geom_smooth()` using formula = 'y ~ x'
## Emphasize the outliers
df_long$col = "grey"
df_long$col[df_long$L1 %in% "BulkFrontalCortex"] <- "red"
ggplot(df_long, aes(x = SDs, group = L1, fill = col)) +
geom_density(data = df_long[!df_long$L1 %in% "BulkFrontalCortex",], alpha = .5) +
geom_density(data = df_long[df_long$L1 %in% "BulkFrontalCortex",], alpha = .6) +
scale_fill_manual(values = c("grey", "red"))+
labs(title = "Distribution of SDs accross datasets",subtitle = "Red=BulkFrontalCortex",
x = "SDs",
y = "Density") +
theme_minimal() +
theme(legend.position = "none") +
scale_x_sqrt()
## Warning: Removed 14 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_density()`).
## Emphasize the outliers
df_long$col = "grey"
df_long$col[df_long$L1 %in% "Blood_Cauc"] <- "red"
ggplot(df_long, aes(x = SDs, group = L1, fill = col)) +
geom_density(data = df_long[!df_long$L1 %in% "Blood_Cauc",], alpha = .5) +
geom_density(data = df_long[df_long$L1 %in% "Blood_Cauc",], alpha = .6) +
scale_fill_manual(values = c("grey", "red"))+
labs(title = "Distribution of SDs accross datasets",subtitle = "Red=Blood_Cauc",
x = "SDs",
y = "Density") +
theme_minimal() +
theme(legend.position = "none") +
scale_x_sqrt()
## Warning: Removed 16 rows containing non-finite outside the scale range
## (`stat_density()`).
## Emphasize the outlier
df_long$col = "grey"
df_long$col[df_long$L1 %in% "Blood_Hisp"] <- "red"
ggplot(df_long, aes(x = SDs, group = L1, fill = col)) +
geom_density(data = df_long[!df_long$L1 %in% "Blood_Hisp",], alpha = .5) +
geom_density(data = df_long[df_long$L1 %in% "Blood_Hisp",], alpha = .6) +
scale_fill_manual(values = c("grey", "red"))+
labs(title = "Distribution of SDs accross datasets",subtitle = "Red=Blood_Hisp",
x = "SDs",
y = "Density") +
theme_minimal() +
theme(legend.position = "none") +
scale_x_sqrt()
## Warning: Removed 8 rows containing non-finite outside the scale range
## (`stat_density()`).
## Removed 8 rows containing non-finite outside the scale range
## (`stat_density()`).
The equation is:
\[ \log\left(P(M_j)\right) = \sum_{i=1}^{n} \log\left( \sum_{Z_j=0}^{1} \left( \sum_{Z_{j,k}=0}^{1} P(M_{i,j} \mid Z_{j,k}) \times P(Z_{j,k} \mid Z_j) \right) \times P(Z_j) \right) \]
The transformation lead to weird values:
# my_list_mat$Blood_Hisp[rownames(my_list_mat$Blood_Hisp) %in% "cg23089912",1:10]
# scaled_list_mat$Blood_Hisp[rownames(scaled_list_mat$Blood_Hisp) %in% "cg23089912",1:10]
#
# test <- readRDS("/mnt/old_user_accounts/p3/maria/PhD/Data/datasets/GEO/Raw_cleaned_beta_matrices_GEO/Blood_Hisp")
# test[rownames(test) %in% "cg23089912",5:10]
#
# test <- readRDS("/mnt/old_user_accounts/p3/maria/PhD/Data/datasets/GEO/BMIQ + 10 PCs + age + sex OUTLIERS REMOVED/Blood_Hisp.RDS")
# test[rownames(test) %in% "cg23089912",5:10]
# # GSM1870986
# # 0.00000000000000001124546 --> give extreme value in scaling, and p dnorm=0
The zero in 8h sample was weirdly transformed. The cleaned values are very different from the original. Is that expected?
Run on LSHTM server (need to be outside of RStudio): source(“run_hvCpGdetection.R”) ## for different parameters
load("/home/alice/2024_hvCpG/05_hvCpGalgorithm/resultsDir/results_NM_test3000_0.95.RDA")
results_NM_test3000_0.95 <- result %>% data.frame %>% tibble::rownames_to_column("CpG") %>%
dplyr::rename(alpha_NM_0.95=".")
load("/home/alice/2024_hvCpG/05_hvCpGalgorithm/resultsDir/results_LB_test3000_0.95.RDA")
results_LB_test3000_0.95 <- result %>% data.frame %>% tibble::rownames_to_column("CpG") %>%
dplyr::rename(alpha_LB_0.95="alpha")
merge(results_NM_test3000_0.95, results_LB_test3000_0.95, by = "CpG") -> results_test3000_0.95
results_test3000_0.95 %>% mutate(Previous.results = ifelse(CpG %in% MariasCpGs$CpG,
"1500 hvCpG in Maria's study", "1500 mQTL matching controls")) %>% ggplot(aes(x=alpha_LB_0.95, y=alpha_NM_0.95, fill = Previous.results)) +
geom_abline(slope = 1) +
geom_point(pch=21, size = 3, alpha = .8) +
theme_bw()
Some CpGs have an alpha of 0 for LBB algorithm and different alpha for NM
results_test3000_0.95[round(results_test3000_0.95$alpha_NM_0.95, 1) != round(results_test3000_0.95$alpha_LB_0.95, 1),] %>% mutate(Previous.results = ifelse(CpG %in% MariasCpGs$CpG, "1500 hvCpG in Maria's study", "1500 mQTL matching controls")) %>% melt() %>%
ggplot(aes(x=CpG, y=value, fill = variable)) +
geom_point(pch=21, size = 3, alpha = .8) +
facet_grid(Previous.results~.) + theme_bw() + theme(axis.text.x = element_blank())
## Using CpG, Previous.results as id variables
The probability of zero seems to be an artefact of the “L-BFGS-B” optimisation method, so we choose “Nelder-Mead”.
load("/home/alice/2024_hvCpG/05_hvCpGalgorithm/resultsDir/results_NM_test3000_0.80.RDA")
results_NM_test3000_0.80 <- result %>% data.frame %>% tibble::rownames_to_column("CpG") %>%
dplyr::rename(alpha_NM_0.80=".")
load("/home/alice/2024_hvCpG/05_hvCpGalgorithm/resultsDir/results_NM_test3000_0.99.RDA")
results_NM_test3000_0.99 <- result %>% data.frame %>% tibble::rownames_to_column("CpG") %>%
dplyr::rename(alpha_NM_0.99=".")
merge(merge(results_NM_test3000_0.80, results_NM_test3000_0.99), results_NM_test3000_0.95) %>%
mutate(Previous.results = ifelse(CpG %in% MariasCpGs$CpG,
"1500 hvCpG in Maria's study", "1500 mQTL matching controls")) %>%
melt() %>%
mutate(p0 = sub(".*_", "", variable)) %>%
ggplot(aes(x=CpG, y=value, fill = Previous.results)) +
geom_point(pch=21, size = 2, alpha = .5) +
scale_fill_manual(values = c("pink","black")) +
ylab("alpha (probability of being a hvCpG)") +
facet_grid(.~p0) + theme_bw() + theme(axis.text.x = element_blank())
## Using CpG, Previous.results as id variables
merge(merge(results_NM_test3000_0.80, results_NM_test3000_0.99), results_NM_test3000_0.95) %>%
mutate(Previous.results = ifelse(CpG %in% MariasCpGs$CpG,
"1500 hvCpG in Maria's study", "1500 mQTL matching controls")) %>%
melt() %>%
mutate(p0 = sub(".*_", "", variable)) %>%
ggplot(aes(x = factor(p0), y = value, fill = Previous.results)) +
geom_point(
pch = 21, alpha = .2,
position = position_jitterdodge(
jitter.width = 0.2, # Horizontal jitter amount
dodge.width = 0.8 # Must match boxplot dodge width
)
) +
geom_boxplot(position = position_dodge(width = 0.8), alpha = .5) +
scale_fill_manual(values = c("pink", "grey")) +
xlab("p0 (probability of true negative)") +
ylab("alpha (probability of being a hvCpG)") +
theme_bw()
## Using CpG, Previous.results as id variables
P0=0.95 sounds fine, not much improvement by taking 0.99
source("../03_prepDatasetsMaria/dataprep_MariaArrays.R")
getplotfewCpG <-function(myvec, title){
listToTest <- lapply(Maria_filtered_list_mat, function(df) df[row.names(df) %in% myvec, , drop = FALSE])
# Combine the list into one data frame
mylist <- lapply(lapply(listToTest, as.data.frame), tibble::rownames_to_column, var = "CpG")
# Combine all into one long data frame
long_df <- bind_rows(mylist, .id = "Dataset")
# Pivot to long format: one row per CpG/sample/tissue
df_long <- na.omit(long_df %>%
tidyr::pivot_longer(
cols = -c(Dataset, CpG),
names_to = "Sample",
values_to = "Value"
))
# Plot with color by source data frame
ggplot(df_long, aes(x = Dataset, y = Value, color = Dataset)) +
geom_point() +
theme_bw() + theme(legend.position = "none") + facet_grid(.~CpG) +
ggtitle(title)+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 5))
}
myvec1 <- results_NM_test3000_0.95 %>%
mutate(Previous.results = ifelse(CpG %in% MariasCpGs$CpG,
"1500 hvCpG in Maria's study", "1500 mQTL matching controls")) %>%
arrange(., alpha_NM_0.95) %>%
filter(Previous.results %in% "1500 hvCpG in Maria's study") %>% head(n=3) %>% select(CpG) %>%
unlist()
getplotfewCpG(myvec1, "Maria's hvCpG with the 3 lowest probabilities to be one in Alice's")
myvec2 <- results_NM_test3000_0.95 %>%
mutate(Previous.results = ifelse(CpG %in% MariasCpGs$CpG,
"1500 hvCpG in Maria's study", "1500 mQTL matching controls")) %>%
arrange(., desc(alpha_NM_0.95)) %>%
filter(Previous.results %in% "1500 hvCpG in Maria's study") %>% head(n=3) %>% select(CpG) %>%
unlist()
getplotfewCpG(myvec2, "Maria's hvCpG with the 3 highest probabilities to be one in Alice's")
myvec3 <- results_NM_test3000_0.95 %>%
mutate(Previous.results = ifelse(CpG %in% MariasCpGs$CpG,
"1500 hvCpG in Maria's study", "1500 mQTL matching controls")) %>%
arrange(., desc(alpha_NM_0.95)) %>%
filter(!Previous.results %in% "1500 hvCpG in Maria's study") %>% head(n=3) %>% select(CpG) %>%
unlist()
getplotfewCpG(myvec3, "CpG not detected by Maria as hvCpG, with the 3 highest probabilities to be one in Alice's")
myvec4 <- results_NM_test3000_0.95 %>%
mutate(Previous.results = ifelse(CpG %in% MariasCpGs$CpG,
"1500 hvCpG in Maria's study", "1500 mQTL matching controls")) %>%
arrange(., alpha_NM_0.95) %>%
filter(!Previous.results %in% "1500 hvCpG in Maria's study") %>% head(n=3) %>% select(CpG) %>%
unlist()
getplotfewCpG(myvec4, "CpG not detected by Maria as hvCpG, with the 3 lowest probabilities to be one in Alice's")
lambdas are defined based on a 5% threshold, even if alpha is not. How to not hardcode them? MCMC on lambda? Link with alpha?